/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is HadoopRunsMerger.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*
*/
package org.terrier.structures.indexing.singlepass.hadoop;
import java.io.IOException;
import java.util.LinkedList;
import java.util.ListIterator;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.indexing.singlepass.PostingInRun;
import org.terrier.structures.indexing.singlepass.RunIterator;
import org.terrier.structures.indexing.singlepass.RunIteratorFactory;
import org.terrier.structures.indexing.singlepass.RunsMerger;
/**
 * The main merger class for Hadoop runs. It merges the lexicons and
 * inverted index shards produced by the map task indexers.
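 * <p>A minimal usage sketch (illustrative only: the runs source, map data,
 * term loop and lexicon stream are assumptions about the caller, not part of
 * this class):
 * <pre>{@code
 * HadoopRunsMerger merger = new HadoopRunsMerger(runsSource);
 * merger.setNumReducers(1);
 * merger.beginMerge(mapData);    // per-map statistics from side-effect files
 * while (moreTerms())            // driven by the reduce task, term by term
 *     merger.mergeOne(lexStream);
 * merger.endMerge(lexStream);    // a no-op in this implementation
 * }</pre>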
* @since 2.2
* @author Richard McCreadie and Craig Macdonald
*/
public class HadoopRunsMerger extends RunsMerger {
/** The data loaded from side-effect files about each map task */
protected LinkedList<MapData> mapData = null;
	/** Number of reducers used */
protected int numReducers = 0;
/**
* Constructs an instance of HadoopRunsMerger.
	 * @param _runsSource the factory from which iterators over the runs to be merged are obtained
*/
public HadoopRunsMerger(RunIteratorFactory _runsSource) {
super(_runsSource);
}
/**
	 * Begins an alternate merge operation over the runs described by a linked
	 * list of MapData objects. It records the per-map statistics that later
	 * calls to {@link #mergeOne(LexiconOutputStream)} use to merge the multiple
	 * runs created during the map phase of Hadoop indexing, correcting for the
	 * document id 'shift' caused by the splitting of runs across flushes and
	 * map splits.
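	 * <p>For instance (illustrative numbers): if map split 0 indexed 1000
	 * documents, a posting for local document 5 in the first flush of map
	 * split 1 refers to global document 1005; the 1000-document shift is
	 * supplied by {@link #getDocumentOffset(int, int)}.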
	 * @param _mapData information about the number of documents per map and run, one element for every map.
*/
public void beginMerge(LinkedList<MapData> _mapData)
{
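		// record the per-map statistics; the merging itself is performed by mergeOne()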
mapData = _mapData;
}
/**
	 * {@inheritDoc}
	 * <p>A no-op in this implementation.
*/
public void endMerge(LexiconOutputStream<String> lexStream) {}
/**
	 * {@inheritDoc}
	 * <p>Merges the postings of the current term from each run, shifting the
	 * document ids of each run by the offset of its originating map split and
	 * flush, and writes the merged lexicon entry to the given lexicon stream.
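	 * <p>An illustrative example (the runs and document ids are assumptions):
	 * <pre>
	 * run from split 0, flush 0: "dog" -> local docids {2, 5}, offset 0    -> {2, 5}
	 * run from split 1, flush 0: "dog" -> local docids {0, 3}, offset 1000 -> {1000, 1003}
	 * merged posting list for "dog": {2, 5, 1000, 1003}
	 * </pre>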
*/
public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception
{
		RunIterator run = runsSource.createRunIterator(-1);
		HadoopRunPostingIterator _run = (HadoopRunPostingIterator) run;
		lastTermWritten = null;
		lastFreq = 0;
		lastDocFreq = 0;
		lastDocument = -1;
		long startOffset = this.getByteOffset();
		byte startBitOffset = this.getBitOffset();
		LexiconEntry le = null;
		// for one term: merge each run's set of postings for that term
		while (run.hasNext()) {
			PostingInRun posting = run.next();
			lastTermWritten = posting.getTerm();
			// shift this run's document ids by the number of documents
			// indexed before its map split and flush
			final int docOffset = getDocumentOffset(_run.getSplitNo(), _run.getRunNo());
			lastDocument = posting.append(bos, lastDocument, docOffset);
			// accumulate the term's lexicon statistics across the runs
			if (le == null)
				le = posting.getLexiconEntry();
			else
				posting.addToLexiconEntry(le);
			lastFreq += posting.getTF();
			lastDocFreq += posting.getDf();
		}
le.setTermId(currentTerm++);
((BasicLexiconEntry)le).setOffset(startOffset, startBitOffset);
lexStream.writeNextEntry(lastTermWritten, le);
numberOfPointers += lastDocFreq;
}
	/** Gets the number of reducers to merge for:
	 * 1 for a single reducer,
	 * greater than 1 when multiple reducers are used.
* @return how many reducers are in use.
*/
public int getNumReducers() {
return numReducers;
}
	/** Sets the number of reducers to merge for:
	 * 1 for a single reducer,
	 * greater than 1 when multiple reducers are used.
	 * @param _numReducers how many reducers are in use.
	 */
public void setNumReducers(int _numReducers) {
this.numReducers = _numReducers;
}
/**
	 * Computes the document id offset for postings from a given map split and
	 * flush: the number of documents indexed by all earlier map splits, plus
	 * the number of documents in the earlier flushes of the same split.
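	 * <p>A worked example (illustrative numbers): if split 0 indexed 1000
	 * documents in total, and split 1 flushed runs of 600 and 400 documents,
	 * then getDocumentOffset(1, 1) = 1000 + 600 = 1600.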
	 * @param splitNo the number of the map split
	 * @param flushNumber the number of the flush (run) within that split
	 * @return the document id offset to add to postings from that split and flush
	 * @throws IOException if no map data is held for the given split
	 */
public int getDocumentOffset(int splitNo, int flushNumber) throws IOException {
		// count the documents indexed by every map split before this one
		int numPreDocs = 0;
		MapData correctHRD = null;
		for (MapData tempHRD : mapData)
		{
			if (splitNo == tempHRD.getSplitnum()) {
				correctHRD = tempHRD;
				break;
			}
			numPreDocs += tempHRD.getMapDocs();
		}
		if (correctHRD == null)
			throw new IOException("Did not find map data for split " + splitNo);
		// then add the sizes of the earlier flushes within this split
		int currentFlushDocs = 0;
		ListIterator<Integer> flushSizes = correctHRD.getFlushDocSizes().listIterator(0);
		int currentFlush = 0;
		while (currentFlush < flushNumber) {
			currentFlushDocs += flushSizes.next();
			currentFlush++;
		}
		return numPreDocs + currentFlushDocs;
}
}